In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline 

import warnings
warnings.filterwarnings("ignore")
In [2]:
# Lift pandas' truncation limits so every column and every row is rendered.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)
# The 'precision' option is intentionally left at its default.

Importing Power Generation & Weather Sensor Data¶

In [3]:
# Load the generation records from a CSV file expected in the working directory.
generation_data = pd.read_csv("Plant_1_Generation_Data.csv")
In [4]:
# Show five random generation rows as a styled table.
# 'darkblack' is not a CSS colour name (the declaration would be ignored),
# so the border colour is spelled 'black'.
generation_data.sample(5).style.set_properties(
    **{
        'background-color': 'OliveDrab',
        'color': 'white',
        'border-color': 'black'
    })
Out[4]:
  DATE_TIME PLANT_ID SOURCE_KEY DC_POWER AC_POWER DAILY_YIELD TOTAL_YIELD
10508 20-05-2020 04:30 4135001 adLQvlD726eNBSB 0.000000 0.000000 0.000000 6304513.000000
54847 11-06-2020 09:00 4135001 uHbuxQJl8lW7ozc 3864.250000 379.737500 502.625000 7241010.625000
65902 16-06-2020 14:45 4135001 WRmjgnKYAwPKWDb 7727.125000 755.975000 5198.250000 7266513.250000
8416 19-05-2020 03:15 4135001 wCURE6d3bPkepu2 0.000000 0.000000 0.000000 6808393.000000
28319 29-05-2020 16:30 4135001 ih0vzX44oOqAx2f 4015.714286 394.571429 7615.857143 6294785.857000
In [5]:
# Load the weather-sensor records from a CSV file expected in the working directory.
weather_data = pd.read_csv("Plant_1_Weather_Sensor_Data.csv")
In [6]:
# Show five random weather rows as a styled table.
# 'darkblack' is not a CSS colour name (the declaration would be ignored),
# so the border colour is spelled 'black'.
weather_data.sample(5).style.set_properties(
    **{
        'background-color': 'pink',
        'color': 'Black',
        'border-color': 'black'
    })
Out[6]:
  DATE_TIME PLANT_ID SOURCE_KEY AMBIENT_TEMPERATURE MODULE_TEMPERATURE IRRADIATION
3141 2020-06-17 13:45:00 4135001 HmiyD2TTLFNqkNe 29.057731 45.051337 0.599461
704 2020-05-22 22:30:00 4135001 HmiyD2TTLFNqkNe 23.547205 22.648830 0.000000
3068 2020-06-16 19:30:00 4135001 HmiyD2TTLFNqkNe 24.442285 23.068772 0.000000
3149 2020-06-17 15:45:00 4135001 HmiyD2TTLFNqkNe 23.556775 23.536192 0.085784
890 2020-05-24 22:30:00 4135001 HmiyD2TTLFNqkNe 24.650606 23.198922 0.000000

Adjust Datetime format¶

In [7]:
def parse_datetime(date_str):
    """Parse a timestamp string by trying a fixed list of formats in order.

    The year-first formats are attempted before the day-first ones, each
    with and without a seconds field. The first format that parses wins.

    Raises:
        ValueError: if none of the candidate formats matches ``date_str``.
    """
    candidate_formats = (
        '%Y-%m-%d %H:%M',
        '%Y-%m-%d %H:%M:%S',
        '%d-%m-%Y %H:%M',
        '%d-%m-%Y %H:%M:%S',
    )
    for candidate in candidate_formats:
        try:
            parsed = pd.to_datetime(date_str, format=candidate)
        except ValueError:
            # This format does not fit; move on to the next one.
            continue
        return parsed
    raise ValueError(f"No valid date format found for {date_str}")

# Parse each frame's DATE_TIME with parse_datetime, then write it back as a
# string in one uniform layout per frame (minutes for generation, seconds
# for weather).
for frame, output_format in (
    (generation_data, '%Y-%m-%d %H:%M'),
    (weather_data, '%Y-%m-%d %H:%M:%S'),
):
    parsed_column = frame['DATE_TIME'].apply(parse_datetime)
    frame['DATE_TIME'] = parsed_column.dt.strftime(output_format)
In [8]:
# Convert the DATE_TIME columns to datetime dtype, each with its own format.
for frame, source_format in (
    (generation_data, '%Y-%m-%d %H:%M'),
    (weather_data, '%Y-%m-%d %H:%M:%S'),
):
    frame['DATE_TIME'] = pd.to_datetime(frame['DATE_TIME'], format=source_format)

Merge the weather and sensor data¶

In [9]:
# Join generation and weather rows that share a DATE_TIME (default inner join).
# PLANT_ID is dropped from both sides and the weather SOURCE_KEY is dropped so
# that only the generation SOURCE_KEY survives in the merged frame.
generation_part = generation_data.drop(columns=['PLANT_ID'])
weather_part = weather_data.drop(columns=['PLANT_ID', 'SOURCE_KEY'])
df_solar = generation_part.merge(weather_part, on='DATE_TIME')
df_solar.sample(5).style.background_gradient(cmap='cool')
Out[9]:
  DATE_TIME SOURCE_KEY DC_POWER AC_POWER DAILY_YIELD TOTAL_YIELD AMBIENT_TEMPERATURE MODULE_TEMPERATURE IRRADIATION
52827 2020-06-10 10:15:00 7JYdWkrLSPkdwr4 6629.625000 649.375000 1699.625000 7796428.625000 27.128528 38.266939 0.480099
18586 2020-05-24 16:30:00 WRmjgnKYAwPKWDb 3600.500000 353.000000 7716.625000 7100742.625000 32.829294 40.124707 0.242545
46364 2020-06-07 08:45:00 ZnxXDlPa8U1GXgE 5111.000000 501.225000 585.125000 6693639.125000 23.404236 31.958346 0.341317
32196 2020-05-31 13:30:00 3PZuoBAID5Wc2HD 5162.625000 505.587500 4335.875000 7113823.875000 30.101738 48.065961 0.411193
29324 2020-05-30 04:45:00 iCRJl6heRkivqQ3 0.000000 0.000000 0.000000 7291253.000000 21.293420 20.270673 0.000000

Observation: A huge amount of power is getting lost at the inverters. Only 1/10th of DC Power is getting converted to AC Power.¶

Separate time and date columns¶

In [10]:
# adding separate time and date columns
# Parse DATE_TIME once and derive every calendar component from that Series.
date_time = pd.to_datetime(df_solar["DATE_TIME"])
df_solar["DATE"] = date_time.dt.date
df_solar["TIME"] = date_time.dt.time
df_solar['DAY'] = date_time.dt.day
df_solar['MONTH'] = date_time.dt.month
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is its replacement. It returns UInt32, so cast back to
# a plain int64 column.
df_solar['WEEK'] = date_time.dt.isocalendar().week.astype('int64')
In [11]:
# add hours and minutes for ml models
# Read the components straight from DATE_TIME. The TIME column holds
# datetime.time objects (and strings once a later cell has run), which
# pd.to_datetime is not guaranteed to accept, so it is not used as the source.
date_time = pd.to_datetime(df_solar['DATE_TIME'])
df_solar['HOURS'] = date_time.dt.hour
df_solar['MINUTES'] = date_time.dt.minute
# Minutes elapsed since midnight.
df_solar['TOTAL MINUTES PASS'] = df_solar['MINUTES'] + df_solar['HOURS']*60
In [12]:
# Keep a string copy of the date, and turn HOURS and TIME into strings in place.
df_solar["DATE_STRING"] = df_solar["DATE"].astype(str)
for text_column in ("HOURS", "TIME"):
    df_solar[text_column] = df_solar[text_column].astype(str)

df_solar.head(2)
Out[12]:
DATE_TIME SOURCE_KEY DC_POWER AC_POWER DAILY_YIELD TOTAL_YIELD AMBIENT_TEMPERATURE MODULE_TEMPERATURE IRRADIATION DATE TIME DAY MONTH WEEK HOURS MINUTES TOTAL MINUTES PASS DATE_STRING
0 2020-05-15 1BY6WEcLGh8j5v7 0.0 0.0 0.0 6259559.0 25.184316 22.857507 0.0 2020-05-15 00:00:00 15 5 20 0 0 0 2020-05-15
1 2020-05-15 1IF53ai7Xc0U56Y 0.0 0.0 0.0 6183645.0 25.184316 22.857507 0.0 2020-05-15 00:00:00 15 5 20 0 0 0 2020-05-15
In [13]:
# Print column dtypes, non-null counts and memory usage of the merged frame.
df_solar.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 68774 entries, 0 to 68773
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   DATE_TIME            68774 non-null  datetime64[ns]
 1   SOURCE_KEY           68774 non-null  object        
 2   DC_POWER             68774 non-null  float64       
 3   AC_POWER             68774 non-null  float64       
 4   DAILY_YIELD          68774 non-null  float64       
 5   TOTAL_YIELD          68774 non-null  float64       
 6   AMBIENT_TEMPERATURE  68774 non-null  float64       
 7   MODULE_TEMPERATURE   68774 non-null  float64       
 8   IRRADIATION          68774 non-null  float64       
 9   DATE                 68774 non-null  object        
 10  TIME                 68774 non-null  object        
 11  DAY                  68774 non-null  int64         
 12  MONTH                68774 non-null  int64         
 13  WEEK                 68774 non-null  int64         
 14  HOURS                68774 non-null  object        
 15  MINUTES              68774 non-null  int64         
 16  TOTAL MINUTES PASS   68774 non-null  int64         
 17  DATE_STRING          68774 non-null  object        
dtypes: datetime64[ns](1), float64(7), int64(5), object(5)
memory usage: 10.0+ MB
In [14]:
# Count missing values per column.
df_solar.isnull().sum()
Out[14]:
DATE_TIME              0
SOURCE_KEY             0
DC_POWER               0
AC_POWER               0
DAILY_YIELD            0
TOTAL_YIELD            0
AMBIENT_TEMPERATURE    0
MODULE_TEMPERATURE     0
IRRADIATION            0
DATE                   0
TIME                   0
DAY                    0
MONTH                  0
WEEK                   0
HOURS                  0
MINUTES                0
TOTAL MINUTES PASS     0
DATE_STRING            0
dtype: int64
In [15]:
# (rows, columns) of the merged frame.
df_solar.shape
Out[15]:
(68774, 18)
In [16]:
# there are no nulls or blank values in the dataset
In [17]:
# Summary statistics of the numeric columns, shaded with the 'rainbow' colormap.
df_solar.describe().style.background_gradient(cmap='rainbow')
Out[17]:
  DC_POWER AC_POWER DAILY_YIELD TOTAL_YIELD AMBIENT_TEMPERATURE MODULE_TEMPERATURE IRRADIATION DAY MONTH WEEK MINUTES TOTAL MINUTES PASS
count 68774.000000 68774.000000 68774.000000 68774.000000 68774.000000 68774.000000 68774.000000 68774.000000 68774.000000 68774.000000 68774.000000 68774.000000
mean 3147.177450 307.778375 3295.834644 6978727.511362 25.558521 31.244997 0.232305 15.762876 5.518539 22.549481 22.490621 716.515107
std 4036.441826 394.394865 3145.220597 416270.720885 3.361300 12.308283 0.301948 8.554460 0.499660 1.461138 16.772385 412.069969
min 0.000000 0.000000 0.000000 6183645.000000 20.398505 18.140415 0.000000 1.000000 5.000000 20.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 6512006.826000 22.724491 21.123944 0.000000 9.000000 5.000000 21.000000 0.000000 360.000000
50% 428.571429 41.450000 2658.473214 7146685.000000 24.670178 24.818984 0.031620 16.000000 6.000000 23.000000 15.000000 720.000000
75% 6365.468750 623.561161 6274.000000 7268751.397000 27.960429 41.693659 0.454880 23.000000 6.000000 24.000000 30.000000 1065.000000
max 14471.125000 1410.950000 9163.000000 7846821.000000 35.252486 65.545714 1.221652 31.000000 6.000000 25.000000 45.000000 1425.000000

Converting 'SOURCE_KEY' from categorical form to numerical form¶

In [18]:
#from sklearn.preprocessing import LabelEncoder
#encoder = LabelEncoder()
#df_solar['SOURCE_KEY_NUMBER'] = encoder.fit_transform(df_solar['SOURCE_KEY'])
#df_solar.head()
#df_solar
In [19]:
from sklearn.preprocessing import LabelEncoder

# Map every distinct SOURCE_KEY string to an integer code.
encoder = LabelEncoder()

# fit_transform already returns one code per row, in row order, so the result
# is assigned directly. Copying it into the frame in 10000-row chunks added
# nothing (the whole array was already in memory) and its chunk count was off
# by one whenever the row count was an exact multiple of the chunk size.
encoded_values = encoder.fit_transform(df_solar['SOURCE_KEY'])
df_solar['SOURCE_KEY_NUMBER'] = encoded_values

print(df_solar.head())
   DATE_TIME       SOURCE_KEY  DC_POWER  AC_POWER  DAILY_YIELD  TOTAL_YIELD  \
0 2020-05-15  1BY6WEcLGh8j5v7       0.0       0.0          0.0    6259559.0   
1 2020-05-15  1IF53ai7Xc0U56Y       0.0       0.0          0.0    6183645.0   
2 2020-05-15  3PZuoBAID5Wc2HD       0.0       0.0          0.0    6987759.0   
3 2020-05-15  7JYdWkrLSPkdwr4       0.0       0.0          0.0    7602960.0   
4 2020-05-15  McdE0feGgRqW7Ca       0.0       0.0          0.0    7158964.0   

   AMBIENT_TEMPERATURE  MODULE_TEMPERATURE  IRRADIATION        DATE      TIME  \
0            25.184316           22.857507          0.0  2020-05-15  00:00:00   
1            25.184316           22.857507          0.0  2020-05-15  00:00:00   
2            25.184316           22.857507          0.0  2020-05-15  00:00:00   
3            25.184316           22.857507          0.0  2020-05-15  00:00:00   
4            25.184316           22.857507          0.0  2020-05-15  00:00:00   

   DAY  MONTH  WEEK HOURS  MINUTES  TOTAL MINUTES PASS DATE_STRING  \
0   15      5    20     0        0                   0  2020-05-15   
1   15      5    20     0        0                   0  2020-05-15   
2   15      5    20     0        0                   0  2020-05-15   
3   15      5    20     0        0                   0  2020-05-15   
4   15      5    20     0        0                   0  2020-05-15   

   SOURCE_KEY_NUMBER  
0                  0  
1                  1  
2                  2  
3                  3  
4                  4  
In [20]:
# Write the merged, feature-enriched frame to a CSV file in the working
# directory, without the row index.
file_name = 'df_solar.csv'
df_solar.to_csv(file_name, index=False)

Data Visualization¶

In [21]:
df_solar['DATE'].nunique() # number of distinct calendar dates in the merged data (34 in the recorded run)
Out[21]:
34

1. DC & AC Power plots over all 34 days¶

In [22]:
# Mean DC_POWER per (TIME, DATE) cell: one row per time of day, one column per date.
solar_dc = df_solar.pivot_table(values='DC_POWER', index='TIME', columns='DATE')

def Daywise_plot_dc(data=None, row=None, col=None, title='DC Power'):
    """Draw one red line chart per column of ``data`` on a ``row`` x ``col`` grid.

    Args:
        data: frame whose columns are plotted one per subplot.
        row, col: subplot grid dimensions; the grid must have at least as
            many cells as ``data`` has columns.
        title: prefix of each subplot title; the column label is appended.
    """
    figure = plt.figure(figsize=(20, 40))
    figure.subplots_adjust(wspace=0.2, hspace=0.5)

    # Subplot positions are 1-based.
    for position, day in enumerate(data.columns, start=1):
        axis = figure.add_subplot(row, col, position)
        data[day].plot(ax=axis, color='red')
        axis.set_title(f'{title} {day}', color='blue')

Daywise_plot_dc(data=solar_dc, row=12, col=3)
In [23]:
# Mean AC_POWER per (TIME, DATE) cell: one row per time of day, one column per date.
solar_ac = df_solar.pivot_table(values='AC_POWER', index='TIME', columns='DATE')

def Daywise_plot_ac(data=None, row=None, col=None, title='AC Power'):
    """Draw one red line chart per column of ``data`` on a ``row`` x ``col`` grid.

    Args:
        data: frame whose columns are plotted one per subplot.
        row, col: subplot grid dimensions; the grid must have at least as
            many cells as ``data`` has columns.
        title: prefix of each subplot title; the column label is appended.
    """
    canvas = plt.figure(figsize=(20, 40))
    canvas.subplots_adjust(wspace=0.2, hspace=0.5)

    day_labels = data.columns
    for slot in range(len(day_labels)):
        day = day_labels[slot]
        # add_subplot counts positions from 1.
        panel = canvas.add_subplot(row, col, slot + 1)
        data[day].plot(ax=panel, color='red')
        panel.set_title(f'{title} {day}', color='blue')

Daywise_plot_ac(data=solar_ac, row=12, col=3)
In [24]:
def Daywise_plot_dc_ac(dc_data=None, ac_data=None, row=None, col=None, title='Power'):
    """Overlay DC (red) and AC (green) curves, one subplot per column of ``dc_data``.

    Args:
        dc_data: frame whose columns drive the subplot layout and the red lines.
        ac_data: frame indexed by the same column labels, drawn in green.
        row, col: subplot grid dimensions.
        title: prefix of each subplot title; the column label is appended.
    """
    figure = plt.figure(figsize=(20, 40))
    figure.subplots_adjust(wspace=0.2, hspace=0.5)

    for position, day in enumerate(dc_data.columns, start=1):
        axis = figure.add_subplot(row, col, position)
        # Both series share one y-axis.
        dc_data[day].plot(ax=axis, color='red', label='DC Power')
        ac_data[day].plot(ax=axis, color='green', label='AC Power')
        axis.set_title(f'{title} {day}', color='blue')
        axis.legend()

Daywise_plot_dc_ac(dc_data=solar_dc, ac_data=solar_ac, row=12, col=3)

Observations:¶

DC Low Fluctuations: 2020-05-16, 2020-05-24, 2020-05-25, 2020-06-04¶

DC High Fluctuations: 2020-05-15, 2020-05-17, 2020-05-21, 2020-05-26, 2020-05-27, 2020-06-09, 2020-06-08, 2020-06-11, 2020-06-12, 2020-06-13¶

DC Very High Fluctuations: 2020-05-19, 2020-05-20, 2020-05-31, 2020-06-05, 2020-06-06, 2020-06-17¶

We can see that AC Power follows the same trend as DC Power because it is generated by DC power but it is 1/10th of the DC Power¶

2. DC Power & AC Power generated per day, sorted by descending order¶

In [25]:
# DC_POWER summed over all rows of each date, drawn as bars from largest to smallest.
daily_dc = df_solar.groupby('DATE')['DC_POWER'].sum()

dc_ranked = daily_dc.sort_values(ascending=False)
ax = dc_ranked.plot.bar(figsize=(17, 5), legend=True, color='red')
plt.title('Daily DC Power')
plt.show()
In [26]:
# AC_POWER summed over all rows of each date, drawn as bars from largest to smallest.
daily_ac = df_solar.groupby('DATE')['AC_POWER'].sum()

ac_ranked = daily_ac.sort_values(ascending=False)
ax = ac_ranked.plot.bar(figsize=(17, 5), legend=True, color='red')
plt.title('Daily AC Power')
plt.show()
In [27]:
# Put the daily DC and AC totals side by side, indexed by date.
daily_power = pd.DataFrame({'DC_POWER': daily_dc, 'AC_POWER': daily_ac})

# Grouped bars, dates ordered by descending DC total.
power_ranked = daily_power.sort_values(by='DC_POWER', ascending=False)
ax = power_ranked.plot.bar(figsize=(17, 5), color=['red', 'green'])
ax.set_title('Daily DC and AC Power')
ax.set_xlabel('Date')
ax.set_ylabel('Power')
ax.legend(['DC Power', 'AC Power'])
plt.show()

Observations:¶

Highest DC_POWER Generation is on: 2020-05-25¶

Lowest DC_POWER Generation is on : 2020-05-18¶

3. Irradiation Plots over all 34 days¶

In [28]:
# Mean IRRADIATION per (TIME, DATE) cell: one row per time of day, one column per date.
solar_irradiation = df_solar.pivot_table(values='IRRADIATION', index='TIME', columns='DATE')

def Daywise_plot(data=None, row=None, col=None, title='IRRADIATION'):
    """Draw one blue line chart per column of ``data`` on a ``row`` x ``col`` grid.

    Args:
        data: frame whose columns are plotted one per subplot.
        row, col: subplot grid dimensions; the grid must have at least as
            many cells as ``data`` has columns.
        title: prefix of each subplot title; the column label is appended.
    """
    figure = plt.figure(figsize=(20, 40))
    figure.subplots_adjust(wspace=0.2, hspace=0.5)

    # Subplot positions are 1-based.
    for position, day in enumerate(data.columns, start=1):
        axis = figure.add_subplot(row, col, position)
        data[day].plot(ax=axis, color='blue')
        axis.set_title(f'{title} {day}', color='blue')

Daywise_plot(data=solar_irradiation, row=12, col=3)
In [29]:
def Daywise_plot_dc_irradiation(dc_data=None, irrad_data=None, row=None, col=None, title='DC Vs Irradiation'):
    """Plot DC power and irradiation for each day, one subplot per column of ``dc_data``.

    DC power is drawn in red against the left y-axis and irradiation in green
    against a separate right y-axis. The two quantities differ by several
    orders of magnitude, so on a shared axis the irradiation curve collapses
    onto the baseline and cannot be compared with the DC curve.

    Args:
        dc_data: frame whose columns drive the subplot layout and the red lines.
        irrad_data: frame indexed by the same column labels, drawn in green.
        row, col: subplot grid dimensions.
        title: prefix of each subplot title; the column label is appended.
    """
    gp = plt.figure(figsize=(20, 40))
    # Slightly wider horizontal gap than the single-axis grids, to leave room
    # for the right-hand tick labels.
    gp.subplots_adjust(wspace=0.3, hspace=0.5)

    for position, day in enumerate(dc_data.columns, start=1):
        ax_dc = gp.add_subplot(row, col, position)
        dc_data[day].plot(ax=ax_dc, color='red', label='DC Power')

        # Second y-axis sharing the same x-axis.
        ax_irr = ax_dc.twinx()
        irrad_data[day].plot(ax=ax_irr, color='green', label='Irradiation')

        ax_dc.set_title('{} {}'.format(title, day), color='blue')

        # Each axes only knows its own line; merge them into one legend.
        dc_handles, dc_labels = ax_dc.get_legend_handles_labels()
        irr_handles, irr_labels = ax_irr.get_legend_handles_labels()
        ax_dc.legend(dc_handles + irr_handles, dc_labels + irr_labels)

Daywise_plot_dc_irradiation(dc_data=solar_dc, irrad_data=solar_irradiation, row=12, col=3)
In [30]:
# IRRADIATION summed over all rows of each date, drawn as bars from largest to smallest.
daily_irradiation = df_solar.groupby('DATE')['IRRADIATION'].sum()

irradiation_ranked = daily_irradiation.sort_values(ascending=False)
irradiation_ranked.plot.bar(figsize=(17, 5), legend=True, color='blue')
plt.title('IRRADIATION')
plt.show()
In [31]:
# Put the daily DC-power and irradiation totals side by side, indexed by date.
daily_dcpower_irrad = pd.DataFrame({'DC_POWER': daily_dc, 'IRRADIATION': daily_irradiation})

# Plot the frame built above. The earlier version plotted `daily_power`
# (DC and AC) and then labelled the AC bars "Irradiation".
# Irradiation goes on a secondary y-axis because its daily totals are orders
# of magnitude smaller than the DC totals and would be invisible otherwise.
ax = (
    daily_dcpower_irrad
    .sort_values(by='DC_POWER', ascending=False)
    .rename(columns={'DC_POWER': 'DC Power', 'IRRADIATION': 'Irradiation'})
    .plot.bar(figsize=(17, 5), color=['red', 'green'], secondary_y='Irradiation')
)
ax.set_title('Daily DC Power and Irradiation')
ax.set_xlabel('Date')
ax.set_ylabel('DC Power')
# pandas exposes the secondary axes as `right_ax` on the primary axes.
ax.right_ax.set_ylabel('Irradiation')
plt.show()

Observations:¶

Highest Irradiation is on: 2020-05-25¶

Lowest Irradiation is on : 2020-06-18¶

These graphs are similar to the DC output graphs because irradiation is the amount of sunlight falling on a unit area (1 m²), which is directly proportional to the DC output produced. Huge amounts of DC power appear to be generated with very little irradiation.¶

4. Ambient & Module Temperature plots¶

In [32]:
#sns.displot(data=df_solar, x="AMBIENT_TEMPERATURE", kde=True, bins = 100,color = "red", facecolor = "#3F7F7F",height = 5, aspect = 3.5);
In [33]:
# Mean temperature per (TIME, DATE) cell for the ambient and module sensors.
solar_ambiant_temp = df_solar.pivot_table(values='AMBIENT_TEMPERATURE', index='TIME', columns='DATE')
solar_module_temp = df_solar.pivot_table(values='MODULE_TEMPERATURE', index='TIME', columns='DATE')

def Daywise_plot_am_mo(am_data=None, mo_data=None, row=None, col=None, title='Ambient Vs Module'):
    """Overlay ambient (red) and module (green) temperature, one subplot per column of ``am_data``.

    Args:
        am_data: frame whose columns drive the subplot layout and the red lines.
        mo_data: frame indexed by the same column labels, drawn in green.
        row, col: subplot grid dimensions.
        title: prefix of each subplot title; the column label is appended.
    """
    figure = plt.figure(figsize=(20, 40))
    figure.subplots_adjust(wspace=0.2, hspace=0.5)

    for position, day in enumerate(am_data.columns, start=1):
        axis = figure.add_subplot(row, col, position)
        am_data[day].plot(ax=axis, color='red', label='Ambient Temperature')
        mo_data[day].plot(ax=axis, color='green', label='Module Temperature')
        axis.set_title(f'{title} {day}', color='blue')
        axis.legend()

Daywise_plot_am_mo(am_data=solar_ambiant_temp, mo_data=solar_module_temp, row=12, col=3)
In [34]:
# Average the temperatures per day. A per-day *sum* of temperature readings
# has no physical meaning and scales with the number of rows recorded that
# day, so days with missing readings would look colder than they were.
daily_am = df_solar.groupby('DATE')['AMBIENT_TEMPERATURE'].agg('mean')
daily_mo = df_solar.groupby('DATE')['MODULE_TEMPERATURE'].agg('mean')

daily_ambient_module_temp = pd.DataFrame({'AMBIENT_TEMPERATURE': daily_am, 'MODULE_TEMPERATURE': daily_mo})

# Plot the grouped bar plot, dates ordered by descending module temperature
ax = daily_ambient_module_temp.sort_values(by='MODULE_TEMPERATURE', ascending=False).plot.bar(
    figsize=(17, 5), color=['red', 'green']
)
plt.title('Daily Mean Ambient and Module Temperature')
plt.xlabel('Date')
plt.ylabel('Ambient Vs Module')
plt.legend(['Ambient', 'Module'])
plt.show()

Observation:¶

Module temperature is much higher than Ambient Temperature so probably Module temperature is not highly influenced by Ambient Temperature¶

5. DC Power /AC Power/ Irradiation / Ambient Vs Module Temperature¶

Highest DC_POWER is generated on "2020-05-25"¶

In [35]:
date=["2020-05-25"]

# Select the day's rows once and reuse them in every panel.
best_day = df_solar[df_solar["DATE_STRING"].isin(date)]

# Four stacked panels: DC power, AC power, irradiation, temperatures.
fig, (ax_dc, ax_ac, ax_irr, ax_temp) = plt.subplots(4, 1, figsize=(16, 16))

sns.lineplot(data=best_day, x="DATE_TIME", y="DC_POWER",
             label="DC_Power_Best", color='green', ax=ax_dc)
ax_dc.set_title("DC Power Generation: {}" .format(date[0]))

sns.lineplot(data=best_day, x="DATE_TIME", y="AC_POWER",
             label="AC_Power_Best", color='green', ax=ax_ac)
ax_ac.set_title("AC Power Generation: {}" .format(date[0]))

sns.lineplot(data=best_day, x="DATE_TIME", y="IRRADIATION",
             label="Irridation_Best", color='green', ax=ax_irr)
ax_irr.set_title("Irradiation : {}" .format(date[0]))

# Ambient and module temperature share the last panel.
sns.lineplot(data=best_day, x="DATE_TIME", y="AMBIENT_TEMPERATURE",
             label="Ambient_Temperature_Best", color='green', ax=ax_temp)
sns.lineplot(data=best_day, x="DATE_TIME", y="MODULE_TEMPERATURE",
             label="Module_Temperature_Best", color='blue', ax=ax_temp)
ax_temp.set_title("Module Temperature & Ambient Temperature: {}" .format(date[0]))

plt.tight_layout()
plt.show()

NOTE: Both the DC_POWER graph and the IRRADIATION graph look almost like the ideal curves explained earlier. The weather also looks good, and there appear to be no clouds in the sky, because there is very little variation in the irradiation, in the temperature of the solar panel and in the ambient temperature.¶

Lowest average DC_POWER is generated on "2020-05-18"¶

In [36]:
date=["2020-05-18"]

# Select the day's rows once and reuse them in every panel.
worst_day = df_solar[df_solar["DATE_STRING"].isin(date)]

# Four stacked panels: DC power, AC power, irradiation, temperatures.
fig, (ax_dc, ax_ac, ax_irr, ax_temp) = plt.subplots(4, 1, figsize=(16, 16))

sns.lineplot(data=worst_day, x="DATE_TIME", y="DC_POWER",
             label="DC_Power_Worst", color='red', ax=ax_dc)
ax_dc.set_title("DC Power Generation: {}" .format(date[0]))

sns.lineplot(data=worst_day, x="DATE_TIME", y="AC_POWER",
             label="AC_Power_Worst", color='red', ax=ax_ac)
ax_ac.set_title("AC Power Generation: {}" .format(date[0]))

sns.lineplot(data=worst_day, x="DATE_TIME", y="IRRADIATION",
             label="Irridation_Worst", color='red', ax=ax_irr)
ax_irr.set_title("Irradiation : {}" .format(date[0]))

# Ambient and module temperature share the last panel.
sns.lineplot(data=worst_day, x="DATE_TIME", y="AMBIENT_TEMPERATURE",
             label="Ambient_Temperature_Worst", color='red', ax=ax_temp)
sns.lineplot(data=worst_day, x="DATE_TIME", y="MODULE_TEMPERATURE",
             label="Module_Temperature_Worst", color='blue', ax=ax_temp)
ax_temp.set_title("Module Temperature & Ambient Temperature: {}" .format(date[0]))

plt.tight_layout()
plt.show()

6. AC Power vs Inverter Plot for a particular date and time¶

In [37]:
# List the distinct SOURCE_KEY values (used as the inverter ID in the plot below).
df_solar['SOURCE_KEY'].unique()
Out[37]:
array(['1BY6WEcLGh8j5v7', '1IF53ai7Xc0U56Y', '3PZuoBAID5Wc2HD',
       '7JYdWkrLSPkdwr4', 'McdE0feGgRqW7Ca', 'VHMLBKoKgIrUVDU',
       'WRmjgnKYAwPKWDb', 'ZnxXDlPa8U1GXgE', 'ZoEaEvLYb1n2sOq',
       'adLQvlD726eNBSB', 'bvBOhCH3iADSZry', 'iCRJl6heRkivqQ3',
       'ih0vzX44oOqAx2f', 'pkci93gMrogZuBj', 'rGa61gmuvPhdLxV',
       'sjndEbLyjtCKgGv', 'uHbuxQJl8lW7ozc', 'wCURE6d3bPkepu2',
       'z9Y9gH1T5YWrNuG', 'zBIq5rxdHJRwDNY', 'zVJPv84UY57bAof',
       'YxYtjZvoooNbGkE'], dtype=object)
In [38]:
# Snapshot of every SOURCE_KEY at a single timestamp.
filter_date = '2020-06-06 08:30:00'
filtered_df = df_solar[df_solar['DATE_TIME'] == filter_date]

plt.figure(figsize=(14, 8))

# One line per power column: (column, colour, legend label).
line_specs = (
    ('DC_POWER', 'g', 'DC Power Input'),
    ('AC_POWER', 'r', 'AC Power Output'),
)
for power_column, line_colour, legend_label in line_specs:
    plt.plot(filtered_df['SOURCE_KEY'], filtered_df[power_column],
             marker='o', linestyle='-', color=line_colour, label=legend_label)

plt.title(f'Power vs Inverter ID on {filter_date}')
plt.xlabel('Inverter ID')
plt.ylabel('Power (kW)')

# Vertical tick labels keep the long IDs readable.
plt.xticks(rotation=90)

plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

Observation:¶

Here we can see that every inverter performs badly in converting DC to AC power efficiently.¶

Solar Power Plant Inverter Efficiency Calculation¶

In [39]:
# Strictly positive DC and AC readings as NumPy arrays. Each column is
# filtered on its own values.
dc_is_positive = df_solar['DC_POWER'] > 0
ac_is_positive = df_solar['AC_POWER'] > 0
solar_dc_power = df_solar.loc[dc_is_positive, 'DC_POWER'].values
solar_ac_power = df_solar.loc[ac_is_positive, 'AC_POWER'].values
In [40]:
# Ratio of the largest AC reading to the largest DC reading, as a percentage.
peak_ac = np.max(solar_ac_power)
peak_dc = np.max(solar_dc_power)
solar_plant_eff = (peak_ac / peak_dc) * 100
print(f"Power ratio AC/DC (Efficiency) of Solar Power Plant:  {solar_plant_eff:0.3f} %")
Power ratio AC/DC (Efficiency) of Solar Power Plant:  9.750 %

Observation:¶

The efficiency of the plant is very poor. This is likely due to the fact that DC power is not getting converted to AC Power efficiently.¶

In [41]:
# Strictly positive AC_POWER readings, in row order.
AC_list = [ac_reading for ac_reading in df_solar['AC_POWER'] if ac_reading > 0]
len(AC_list)
Out[41]:
36823
In [42]:
# Strictly positive DC_POWER readings, ordered from largest to smallest.
DC_list = sorted(
    (dc_reading for dc_reading in df_solar['DC_POWER'] if dc_reading > 0),
    reverse=True,
)
len(DC_list)
Out[42]:
36823
In [43]:
plt.figure(figsize=(16,8))

# Efficiency must be computed per measurement: AC and DC from the SAME row.
# Sorting the two lists independently before zipping paired each AC value
# with a DC value from an unrelated row (and zip silently truncates if the
# two lists ever differ in length).
producing = df_solar[(df_solar['DC_POWER'] > 0) & (df_solar['AC_POWER'] > 0)]
producing = producing.sort_values('AC_POWER')

# Row-aligned lists, ordered by ascending AC power for plotting.
AC_list = producing['AC_POWER'].tolist()
DC_list = producing['DC_POWER'].tolist()
eff = [ac / dc for ac, dc in zip(AC_list, DC_list)]

# Markers rather than a connected line: several rows can share nearly the
# same AC power with different ratios.
plt.plot(AC_list, eff, '.', color='green', markersize=2)
plt.xlabel('Output power in kW')
plt.ylabel('efficiency AC/DC')
plt.title('Output power vs efficiency');

Q1. Can we identify the need for panel cleaning/maintenance?¶

The module temperature is usually quite high but is not strongly influenced by the ambient temperature. This has been observed over all 34 days in Plant 2. This can lead to low efficiency. Similar behaviour has not been observed in Plant 1. In Plant 1, the ambient temperature and module temperature values are seen to be similar. A module temperature (the actual temperature of the solar panel) this high can point to a cleaning/maintenance requirement.¶

Q2. Can we identify faulty or suboptimally performing equipment?¶

As we have observed through the AC/DC plots, there is a huge power loss at the inverter level. The AC output is almost 1/10th of the DC input. There is an issue with every inverter because this is the case with every AC observation. This may point to faulty equipment.¶

Q3. Can we predict the power generation for the next couple of days?¶

In [44]:
# Work on a copy so the merged frame stays untouched.
df2 = df_solar.copy()

# NOTE(review): DC_POWER, DAILY_YIELD and TOTAL_YIELD are measured at the same
# timestamp as the AC_POWER target, so the models below presumably learn the
# DC-to-AC relation rather than forecast future generation. Confirm this is
# the intended setup for the question being asked.
feature_columns = [
    'DAILY_YIELD',
    'TOTAL_YIELD',
    'AMBIENT_TEMPERATURE',
    'MODULE_TEMPERATURE',
    'IRRADIATION',
    'DC_POWER',
]
X = df2[feature_columns]
y = df2['AC_POWER']
In [45]:
# Preview the first rows of the feature matrix.
X.head()
Out[45]:
DAILY_YIELD TOTAL_YIELD AMBIENT_TEMPERATURE MODULE_TEMPERATURE IRRADIATION DC_POWER
0 0.0 6259559.0 25.184316 22.857507 0.0 0.0
1 0.0 6183645.0 25.184316 22.857507 0.0 0.0
2 0.0 6987759.0 25.184316 22.857507 0.0 0.0
3 0.0 7602960.0 25.184316 22.857507 0.0 0.0
4 0.0 7158964.0 25.184316 22.857507 0.0 0.0
In [46]:
# Preview the first values of the AC_POWER target.
y.head()
Out[46]:
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: AC_POWER, dtype: float64
In [47]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state pins the split.
# NOTE(review): the split is presumably a random shuffle of rows, so training
# and test rows interleave in time. Confirm that is acceptable for a
# forecasting question.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=21)

Linear Regression¶

In [48]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Fit ordinary least squares on the training split and report the estimator's
# own score on the test split, scaled to a percentage.
lr_clf = LinearRegression().fit(X_train, y_train)
test_score = lr_clf.score(X_test, y_test)
score_lr = 100 * test_score
print(f'LR Model score = {score_lr:4.4f}%')
LR Model score = 99.9995%
In [49]:
# Fit linear regression and score its test-split predictions.
lr = LinearRegression()
lr.fit(X_train,y_train)
y_pred_lr = lr.predict(X_test)
# r2_score takes (y_true, y_pred). R² is not symmetric, so passing the
# predictions first measures the wrong quantity.
R2_Score_lr = round(r2_score(y_test, y_pred_lr) * 100, 2)

print("R2 Score : ",R2_Score_lr,"%")
R2 Score :  100.0 %

Random Forest¶

In [50]:
from sklearn.ensemble import RandomForestRegressor
# Seed the forest so a re-run reproduces the same score (same seed as the split).
rfr = RandomForestRegressor(random_state=21)
rfr.fit(X_train,y_train)
y_pred_rfr = rfr.predict(X_test)
# r2_score takes (y_true, y_pred). R² is not symmetric, so passing the
# predictions first measures the wrong quantity.
R2_Score_rfr = round(r2_score(y_test, y_pred_rfr) * 100, 2)

print("R2 Score : ",R2_Score_rfr,"%")
R2 Score :  100.0 %

Decision Tree¶

In [51]:
from sklearn.tree import DecisionTreeRegressor
# Seed the tree so a re-run reproduces the same score (same seed as the split).
dtr = DecisionTreeRegressor(random_state=21)
dtr.fit(X_train,y_train)

# Predict with the decision tree itself. The earlier version called
# rfr.predict here, so the "Decision Tree" score was the random forest's.
y_pred_dtr = dtr.predict(X_test)
# r2_score takes (y_true, y_pred). R² is not symmetric, so passing the
# predictions first measures the wrong quantity.
R2_Score_dtr = round(r2_score(y_test, y_pred_dtr) * 100, 2)

print("R2 Score : ",R2_Score_dtr,"%")
R2 Score :  100.0 %

Result predictions¶

In [52]:
# Predict on the test split with the fitted random forest (rfr) and print the raw array.
prediction = rfr.predict(X_test)
print(prediction)
[   0.         1071.24303578  299.62135714 ...  669.40498213  377.82001786
  117.4835    ]
In [53]:
# Put the test targets next to the random-forest predictions; the frame keeps y_test's index.
cross_checking = pd.DataFrame({'Actual' : y_test , 'Predicted' : prediction})
cross_checking.head()
Out[53]:
Actual Predicted
43819 0.0000 0.000000
2949 1072.3250 1071.243036
33769 299.8125 299.621357
47825 0.0000 0.000000
29370 0.0000 0.000000
In [54]:
# Signed residual: actual minus predicted (positive means the model under-predicted).
cross_checking['Error'] = cross_checking['Actual'] - cross_checking['Predicted']
cross_checking.head()
Out[54]:
Actual Predicted Error
43819 0.0000 0.000000 0.000000
2949 1072.3250 1071.243036 1.081964
33769 299.8125 299.621357 0.191143
47825 0.0000 0.000000 0.000000
29370 0.0000 0.000000 0.000000
In [55]:
# Keep rows whose prediction is within 20 units of the actual value in either
# direction. Comparing the signed error with `<= 20` also kept every large
# over-prediction, because its error is negative.
cross_checking_final  = cross_checking[cross_checking['Error'].abs() <= 20]
# 'LightGreen' is the CSS colour name; the misspelt 'LigntGreen' would be ignored.
cross_checking_final.sample(25).style.background_gradient(
        cmap='coolwarm').set_properties(**{
            'font-family': 'Lucida Calligraphy',
            'color': 'LightGreen',
            'font-size': '15px'
        })
 	
Out[55]:
  Actual Predicted Error
66617 0.000000 0.000000 0.000000
19323 0.000000 0.000000 0.000000
22096 817.942857 817.714625 0.228232
51487 0.000000 0.000000 0.000000
28800 0.000000 0.000000 0.000000
37613 0.000000 0.000000 0.000000
64553 0.000000 0.000000 0.000000
68601 0.000000 0.000000 0.000000
812 556.675000 556.546560 0.128440
41193 0.000000 0.000000 0.000000
51656 0.000000 0.000000 0.000000
40767 842.975000 842.777032 0.197968
15312 0.000000 0.000000 0.000000
68029 90.300000 90.308000 -0.008000
20974 0.000000 0.000000 0.000000
22593 498.537500 498.462643 0.074857
54320 0.000000 0.000000 0.000000
55268 666.642857 666.609250 0.033607
15992 838.875000 839.413482 -0.538482
26192 618.342857 618.319089 0.023768
21612 0.000000 0.000000 0.000000
5465 86.214286 86.242732 -0.028446
37832 0.000000 0.000000 0.000000
36256 628.237500 628.244768 -0.007268
40239 673.285714 673.324286 -0.038571
In [56]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Scale the features: statistics are fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Two hidden ReLU layers (64 and 32 units) feeding a single linear output.
n_features = X_train.shape[1]
mlp_model = Sequential([
    Dense(64, input_dim=n_features, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1),
])

# Mean-squared-error loss, Adam optimiser.
mlp_model.compile(loss='mse', optimizer='adam')

# 50 passes over the training data in mini-batches of 10 rows.
mlp_model.fit(X_train_scaled, y_train, epochs=50, batch_size=10, verbose=1)

# Predict on the scaled test split and report MSE and R².
y_pred_mlp = mlp_model.predict(X_test_scaled)
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
r2_mlp = r2_score(y_test, y_pred_mlp)
print(f'MLP Model - MSE: {mse_mlp}, R2 Score: {r2_mlp}')
Epoch 1/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 277us/step - loss: 49272.4258
Epoch 2/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 264us/step - loss: 17.8594
Epoch 3/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 264us/step - loss: 1.5824
Epoch 4/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 306us/step - loss: 1.3549
Epoch 5/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 266us/step - loss: 1.9260
Epoch 6/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 264us/step - loss: 0.8639
Epoch 7/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 264us/step - loss: 1.1559
Epoch 8/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 275us/step - loss: 1.3298
Epoch 9/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 265us/step - loss: 1.7006
Epoch 10/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 265us/step - loss: 1.3854
Epoch 11/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 275us/step - loss: 1.3155
Epoch 12/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 265us/step - loss: 0.9738
Epoch 13/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 266us/step - loss: 0.7780
Epoch 14/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 265us/step - loss: 0.9318
Epoch 15/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 276us/step - loss: 0.9338
Epoch 16/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 265us/step - loss: 0.8152
Epoch 17/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 265us/step - loss: 0.6181
Epoch 18/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 276us/step - loss: 1.6709
Epoch 19/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 267us/step - loss: 0.9457
Epoch 20/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 294us/step - loss: 1.6886
Epoch 21/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 276us/step - loss: 1.5974
Epoch 22/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 266us/step - loss: 0.6508
Epoch 23/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 266us/step - loss: 1.4363
Epoch 24/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 267us/step - loss: 1.3786
Epoch 25/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 279us/step - loss: 0.9376
Epoch 26/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 268us/step - loss: 0.8532
Epoch 27/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 269us/step - loss: 0.9270
Epoch 28/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 268us/step - loss: 0.7021
Epoch 29/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 269us/step - loss: 0.8422
Epoch 30/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 281us/step - loss: 1.5426
Epoch 31/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 269us/step - loss: 0.9313
Epoch 32/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 268us/step - loss: 0.8556
Epoch 33/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 270us/step - loss: 1.1805
Epoch 34/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 280us/step - loss: 0.7001
Epoch 35/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 271us/step - loss: 0.7493
Epoch 36/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 299us/step - loss: 1.5413
Epoch 37/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 288us/step - loss: 0.7257
Epoch 38/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 269us/step - loss: 0.6937
Epoch 39/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 269us/step - loss: 0.8396
Epoch 40/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 271us/step - loss: 0.7010
Epoch 41/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 282us/step - loss: 0.7524
Epoch 42/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 270us/step - loss: 0.9560
Epoch 43/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 269us/step - loss: 0.5954
Epoch 44/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 280us/step - loss: 0.8679
Epoch 45/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 268us/step - loss: 1.2432
Epoch 46/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 270us/step - loss: 1.5490
Epoch 47/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 281us/step - loss: 0.8518
Epoch 48/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 269us/step - loss: 1.0041
Epoch 49/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 1s 270us/step - loss: 1.1406
Epoch 50/50
5502/5502 ━━━━━━━━━━━━━━━━━━━━ 2s 281us/step - loss: 0.4706
430/430 ━━━━━━━━━━━━━━━━━━━━ 0s 237us/step
MLP Model - MSE: 0.17525856311964316, R2 Score: 0.999998865808189
In [ ]: